################################################################################################
## Replication File for 
## "Capturing Clicks: How the Chinese Government Uses Clickbait to Compete for Visibility"
## Yingdan Lu and Jennifer Pan
## April, 2020
################################################################################################

################################################################################################
## Clickbait strategies analysis
################################################################################################

######### Set Up #########
#install.packages("ggplot2")
library(ggplot2) #version 3.2.1
#install.packages("lubridate")
library(lubridate) #version 1.7.4
#install.packages("extrafont")
library(extrafont) #version 0.17
#install.packages("dplyr")
library(dplyr) #version 0.8.1
#install.packages("reshape2")
library(reshape2) #version 1.4.3
#install.packages("boot")
library(boot) #version 1.3-22
#install.packages("cowplot")
library(cowplot) #version 1.0.0

# load fonts
loadfonts()

# NOTE: every path below is resolved relative to the parent of the directory
# the script is started from
setwd("..")

# import the data (sample posts and non-government posts)
df <- read.csv(
  "data/sample_posts.csv",
  header = TRUE, encoding = "UTF-8", stringsAsFactors = FALSE
)
nongov <- read.csv(
  "data/nongov_posts.csv",
  header = TRUE, encoding = "UTF-8", stringsAsFactors = FALSE
)

# 1 when a count is at least one, 0 otherwise (NA stays NA)
flag_present <- function(counts) {
  ifelse(counts >= 1, 1, 0)
}

# row-wise indicator built from `any()` over the columns `cols` of `data`,
# each column first compared against 1
flag_any <- function(data, cols) {
  ifelse(apply(data[, cols] >= 1, 1, any), 1, 0)
}

# preprocess the data
## parse the month-day-year date strings
df$date_pek <- mdy(df$date_pek)
## any of the five emotion codes present
df$emotional <- flag_any(df, c("joy", "pride", "anger", "fear", "warmth"))

## binary versions of the count variables (sample posts)
df$excl <- flag_present(df$excl_mark)
df$que <- flag_present(df$question_mark)
df$ell <- flag_present(df$ellipsis_mark)
df$pron <- flag_present(df$pronoun_num)
df$phrase <- flag_present(df$phrases_num)

## binary versions of the count variables (non-government posts)
nongov$excl <- flag_present(nongov$excl_mark)
nongov$que <- flag_present(nongov$question_mark)
nongov$ell <- flag_present(nongov$ellipsis_mark)
nongov$pron <- flag_present(nongov$pronoun_num)
nongov$phrase <- flag_present(nongov$phrases_num)

## a title counts as clickbait when any of the nine strategies is present
df$bait <- flag_any(df, c("listicles", "gennn", "hyperbolic", "slang", "excl", "que",
                          "ell", "pron", "phrase"))


######### Clickbait statistics #########
# percentage of titles in each category of the indicators below
## clickbait
100 * prop.table(table(df$bait))
## emotional appeals
100 * prop.table(table(df$emotional))
## emotional appeals or vision appeals (either one present)
100 * prop.table(table(df$emotional | df$vision))


######### Figure 4 #########
# raw clickbait indicator columns, in the order they are plotted
bait_cols <- c("excl_mark", "pronoun_num", "phrases_num", "slang", "ellipsis_mark",
               "question_mark", "listicles", "hyperbolic", "gennn")
df_bait1 <- df[, bait_cols]

# percentage of all titles that use each strategy (share rounded to two
# decimals before scaling to a percentage)
strategy_share <- vapply(
  df_bait1,
  function(column) round(sum(column > 0, na.rm = TRUE) / nrow(df), digits = 2) * 100,
  numeric(1),
  USE.NAMES = FALSE
)
for (msg in paste0(colnames(df_bait1), ":", strategy_share, "%")) {
  print(msg)
}
percent_bait <- data.frame(percent_bait = strategy_share)

# display names of the strategies, aligned with the columns of df_bait1;
# they double as the factor levels so the bars keep this order
strategies <- c("Exclamation\nmark", "  Pronoun",
                "Fixed\nphrase\npattern", "Slang", "Ellipses\nmark",
                "Question\nmark", "Listicle", "Hyperbolic\nword",
                "General\nnoun")
plot.table <- data.frame(
  strategies = factor(strategies, levels = strategies),
  percent_bait = percent_bait$percent_bait
)

# bar chart of the percentage of each strategy, value printed above each bar
ggplot(plot.table, aes(x = strategies, y = percent_bait)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = percent_bait), vjust = -0.3, size = 5.64,
            family = "Times New Roman") +
  labs(x = " ", y = "Titles with strategy (%)") +
  theme_bw(base_size = 16, base_family = "Times New Roman") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 16, colour = "black"),
    axis.title.y = element_text(size = 16, colour = "black"),
    axis.title.x = element_text(size = 16, colour = "black"),
    axis.text.x = element_text(size = 16, colour = "black"),
    axis.text.y = element_text(size = 16, colour = "black")
  )


######### Figure 5 #########
# select the government data that are from Feb 25 to May 25, 2019
# NOTE(review): only the lower bound is enforced here; the May 25 upper bound
# presumably comes from the extent of the data itself -- confirm
gov <- df[df$date_pek > as.Date("2019-02-24"), ]

# flag titles whose total number of marks is greater than one
gov$moremarks <- ifelse(gov$total_mark > 1, 1, 0)
nongov$moremarks <- ifelse(nongov$total_mark > 1, 1, 0)

# the three clickbait codings, each defined by the set of indicator columns
# of which at least one must be present:
#   bait  - all clickbait types
#   bait1 - clickbait excluding single punctuation marks
#   bait2 - clickbait excluding all punctuation marks
word_cols <- c("listicles", "gennn", "hyperbolic", "slang")
bait_defs <- list(
  bait = c(word_cols, "excl", "que", "ell", "pron", "phrase"),
  bait1 = c(word_cols, "pron", "moremarks", "phrase"),
  bait2 = c(word_cols, "pron", "phrase")
)

# code every definition for both the government and non-government posts
for (bait_name in names(bait_defs)) {
  cols <- bait_defs[[bait_name]]
  gov[[bait_name]] <- ifelse(apply(gov[, cols] >= 1, 1, any), 1, 0)
  nongov[[bait_name]] <- ifelse(apply(nongov[, cols] >= 1, 1, any), 1, 0)
}

# statistic passed to boot(): mean of the observations selected by `indices`
#   x       - vector of observations
#   indices - positions of the resampled observations within x
my.mean <- function(x, indices) {
  mean(x[indices])
}

# calculate average clickbait rate of clickbait usage
# and get nonparametric bootstrap confidence intervals

# normal-approximation bootstrap CI for the mean of x: 1000 replicates, with
# the seed reset to 120 before every run; the returned object is the `norm`
# component of boot.ci(), whose 2nd and 3rd entries are used as the bounds
boot_norm_ci <- function(x) {
  set.seed(120)
  resamples <- boot(x, my.mean, 1000, parallel = "multicore")
  boot.ci(resamples, index = 1, type = c("norm"))$norm
}

# each *_bait vector holds: label, point estimate, lower bound, upper bound
# (c() coerces the numbers to character; they are converted back below)
nongov_boot <- boot_norm_ci(nongov$bait)
nongov_bait <- c("Nongov\n(marks>=1)", mean(nongov$bait),
                 nongov_boot[2], nongov_boot[3])

nongov_boot1 <- boot_norm_ci(nongov$bait1)
nongov_bait1 <- c("Nongov\n(marks>1)", mean(nongov$bait1),
                  nongov_boot1[2], nongov_boot1[3])

nongov_boot2 <- boot_norm_ci(nongov$bait2)
nongov_bait2 <- c("Nongov\n(Exclude mark variables)", mean(nongov$bait2),
                  nongov_boot2[2], nongov_boot2[3])

gov_boot <- boot_norm_ci(gov$bait)
gov_bait <- c("Government\n(marks>=1)", mean(gov$bait),
              gov_boot[2], gov_boot[3])

gov_boot1 <- boot_norm_ci(gov$bait1)
gov_bait1 <- c("Government\n(marks>1)", mean(gov$bait1),
               gov_boot1[2], gov_boot1[3])

gov_boot2 <- boot_norm_ci(gov$bait2)
gov_bait2 <- c("Government\n(Exclude mark variables)", mean(gov$bait2),
               gov_boot2[2], gov_boot2[3])

# stack the six estimates into one dataframe, government row first within
# each clickbait coding
gov_nongov <- rbind.data.frame(
  gov_bait, nongov_bait,
  gov_bait1, nongov_bait1,
  gov_bait2, nongov_bait2,
  stringsAsFactors = FALSE
)
colnames(gov_nongov) <- c("account", "mean", "down", "upper")

# back to numeric and from proportions to percentages
for (col in c("mean", "upper", "down")) {
  gov_nongov[[col]] <- as.numeric(gov_nongov[[col]]) * 100
}

# one two-row dataframe per clickbait coding, for plotting
gov_nongov_all <- gov_nongov[1:2, ]
gov_nongov_moremark <- gov_nongov[3:4, ]
gov_nongov_nomark <- gov_nongov[5:6, ]

# plot the government vs non-government clickbait comparison

# build one panel of the comparison: point estimate with its interval for
# each account type
#   estimates - dataframe with columns account, mean, down, upper
#   y_range   - lower and upper limit of the y axis
#   x_title   - x-axis title describing the clickbait coding
#   legend_xy - legend position inside the panel
bait_comparison_plot <- function(estimates, y_range, x_title, legend_xy) {
  group_labels <- c("government", "non-government")
  ggplot(estimates, aes(x = account, y = mean, ymin = down, ymax = upper,
                        shape = account, colour = account)) +
    geom_pointrange(size = 0.6, fatten = 2) +
    scale_colour_manual(labels = group_labels, values = c("black", "#939393")) +
    scale_shape_manual(values = c(20, 17), labels = group_labels) +
    ylim(y_range[1], y_range[2]) +
    xlab(x_title) +
    ylab("Titles with clickbait (%)") +
    theme_bw(base_size = 16, base_family = "Times New Roman") +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    theme(
      text = element_text(size = 16),
      axis.title.y = element_text(size = 16),
      axis.title.x = element_text(size = 16),
      # the x tick labels are dropped; the legend identifies the two groups
      axis.text.x = element_blank(),
      axis.text.y = element_text(size = 16),
      legend.position = legend_xy,
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      legend.background = element_blank()
    ) +
    guides(shape = guide_legend(override.aes = list(size = 0.3)))
}

p1 <- bait_comparison_plot(gov_nongov_all, c(60, 85),
                           "All clickbait types \n ", c(0.36, 0.9))
p2 <- bait_comparison_plot(gov_nongov_moremark, c(42, 67),
                           "Clickbait excluding \n single punctuation marks", c(0.35, 0.9))
p3 <- bait_comparison_plot(gov_nongov_nomark, c(38, 63),
                           "Clickbait excluding \n all punctuation marks", c(0.35, 0.9))

# arrange the three panels side by side
plot_grid(p1, p2, p3, labels = c("A", "B", "C"), ncol = 3, label_y = 0.2)


######### Figure 6 #########
# group posts by day and calculate the daily share of titles with clickbait
# and with emotional appeals; shares are scaled to percentages so the values
# agree with the "(%)" axis label (as in Figures 4 and 5)
group_bait_day <- as.data.frame(
  df %>%
    group_by(date = date_pek) %>%
    summarise(rate = mean(bait) * 100)
)
group_emotional_day <- as.data.frame(
  df %>%
    group_by(date = date_pek) %>%
    summarise(rate = mean(emotional) * 100)
)

# merge the values into one dataset and melt for plotting
a_total <- merge(group_bait_day, group_emotional_day, by = "date")
colnames(a_total)[c(2, 3)] <- c("clickbait_rate", "emotional_rate")
aMelted <- melt(a_total, id.vars = "date")

# plot clickbaits and appeals in one plot; the two series are told apart by
# line type and labelled directly on the panel instead of through a legend
ggplot(aMelted, aes(x = as.Date(date), y = value)) +
  geom_line(aes(linetype = variable), size = 1) +
  scale_x_date(date_labels = "%b", date_breaks = "1 month") +
  theme_bw(base_size = 16, base_family = "Times New Roman") +
  # label positions are given on the percentage scale of the y axis
  annotate("text", y = 90, x = as.Date("2019-05-01"), label = "Clickbait",
           family = "Times New Roman", size = 5.64) +
  annotate("text", y = 30, x = as.Date("2019-05-01"), label = "Emotional \nappeals",
           family = "Times New Roman", size = 5.64) +
  ylab("Titles with strategy (%)") + xlab("2018-2019") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(text = element_text(size = 16, colour = "black"),
        axis.title.y = element_text(size = 16, colour = "black"),
        axis.title.x = element_text(size = 16, colour = "black"),
        axis.text.x = element_text(size = 16, colour = "black"),
        axis.text.y = element_text(size = 16, colour = "black"),
        legend.position = "none")
